import requests
from selenium import webdriver
from bs4 import BeautifulSoup

# Base address of the site being scraped; page paths are appended to it.
url = "http://www.imomoe.io"

# Headers sent with every plain HTTP request: a desktop-Chrome User-Agent.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.70 Safari/537.36'
    ),
}

# Launch the Chrome browser. NOTE: this happens as soon as the module is
# loaded, not only when it is run as a script.
driver = webdriver.Chrome()
# Upper bound on how long a single page load may take (20 seconds).
driver.set_page_load_timeout(20)


# Seconds to wait for a plain HTTP response; mirrors the browser's 20 s
# page-load limit so a stalled server cannot hang the scrape forever.
_REQUEST_TIMEOUT = 20

# Only video links hosted under this prefix are accepted.
_ACCEPTED_SRC_PREFIX = 'http://quan.qq.com'


def _real_video_src(iframe_src):
    """Extract the real video URL embedded in a player iframe ``src``.

    The value is taken from the second ``&``-separated component of
    *iframe_src*: the text between that component's first and second ``=``
    (or up to its end when there is only one ``=``). For example
    ``'player.php?x=1&vid=http://quan.qq.com/v.mp4'`` yields
    ``'http://quan.qq.com/v.mp4'``.

    Returns ``None`` when *iframe_src* does not have that shape, instead of
    raising ``IndexError``.
    """
    components = iframe_src.split('&')
    if len(components) < 2:
        return None
    key_value = components[1].split('=')
    if len(key_value) < 2:
        return None
    return key_value[1]


def _collect_video_srcs(episode_urls):
    """Resolve episode pages to real video URLs, in order.

    Returns ``(sources, finished)``. ``finished`` is ``False`` when
    collection stopped early because an episode page had no ``play2`` iframe
    or its video was not hosted under ``_ACCEPTED_SRC_PREFIX``; ``sources``
    then holds whatever was resolved before that episode.
    """
    sources = []
    for episode_url in episode_urls:
        # Check the HTTP status with requests before involving the browser;
        # episode pages that do not answer 200 are skipped.
        response = requests.get(episode_url, headers=headers,
                                timeout=_REQUEST_TIMEOUT)
        if response.status_code != 200:
            continue

        # The page is loaded in the browser and the rendered source is
        # parsed, presumably because the player iframe is added by script.
        driver.get(url=episode_url)
        rendered = BeautifulSoup(driver.page_source, 'html.parser')
        iframe = rendered.find('iframe', id='play2')
        print(iframe)
        if iframe is None:
            return sources, False

        real_src = _real_video_src(iframe.get('src', ''))
        print(real_src)
        if real_src is None or not real_src.startswith(_ACCEPTED_SRC_PREFIX):
            return sources, False
        sources.append(real_src)
    return sources, True


def htmlurls(url_id, out=None):
    """Scrape the video links of one ``/view/<url_id>.html`` page.

    For every ``div`` with class ``movurl`` on the page, the div's ``id`` is
    written, its episode links are printed and resolved to real video URLs,
    and then the page URL followed by the resolved URLs (one per line) is
    written once. Scraping stops at the first episode that has no ``play2``
    iframe or whose video is not hosted under ``http://quan.qq.com``; links
    resolved before that point are still written. A blank-line separator is
    always written at the end so appended records stay apart.

    Args:
        url_id: Numeric id of the page to scrape.
        out: Writable text file object that receives the results. When
            omitted, the module-level name ``f`` is used (the file opened by
            the script entry point), which keeps existing callers working.

    Raises:
        requests.exceptions.RequestException: If an HTTP request fails or
            exceeds the timeout.
    """
    sink = f if out is None else out

    htmlurl = url + '/view/' + str(url_id) + '.html'
    page = requests.get(htmlurl, headers=headers, timeout=_REQUEST_TIMEOUT)
    page_soup = BeautifulSoup(page.content, 'html.parser')

    for group in page_soup.find_all('div', 'movurl'):
        sink.write(str(group['id']))

        # Absolute URLs of every episode page linked from this group;
        # anchors without an href are ignored.
        episode_urls = [url + anchor['href']
                        for anchor in group.find_all('a', href=True)]
        for episode_url in episode_urls:
            print(episode_url)

        video_srcs, finished = _collect_video_srcs(episode_urls)

        # Write this group's result exactly once, after all of its episodes
        # were handled, so no link is repeated in the output.
        sink.write('\n\n\n')
        sink.write(htmlurl + '\n')
        sink.writelines(src + '\n' for src in video_srcs)

        if not finished:
            # An unusable episode ends the scrape of this page.
            break

    sink.write('\n\n\n\n\n\n')


if __name__ == '__main__':
    try:
        # htmlurls() looks up the module-level name ``f`` for its output, so
        # the results file has to be bound to exactly that name while it runs.
        with open("url.txt", 'a+') as f:
            htmlurls(url_id=7000)
            # To scrape a range of pages, loop here, inside the ``with``
            # block, so the file is still open, e.g.
            # ``for page_id in range(2000, 2005): htmlurls(url_id=page_id)``.
    finally:
        # Always shut the browser down; otherwise the Chrome and chromedriver
        # processes started at import time keep running after the script ends.
        driver.quit()

